gffread /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gff3 -T -o /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf
1 Introduction
For more details, refer to ENCODE RNA-Seq pipeline.
2 Prepare necessary files
2.1 Convert GFF to GTF
2.2 Merge annotations if needed
2.2.1 Prepare input JSON file
{
"merge_anno.annotation": "gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz",
"merge_anno.tRNA": "gencode.v29.tRNAs.gtf.gz",
"merge_anno.spikeins": "ERCC_phiX.fa.gz",
"merge_anno.output_filename": "merged_annotation_V29.gtf.gz"
}
2.2.2 Merge annotations
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/merge_anno.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i merge_anno_input.json --max-concurrent-tasks 2 --singularity
3 Build STAR index
3.1 Prepare input JSON file
{
"build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
"build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
"build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
"build_index.anno_version": "v1",
"build_index.genome": "sugarglider",
"build_index.index_type": "prep_star",
"build_index.ncpu": 60,
"build_index.memGB": 512
}
3.2 Build STAR index
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/star_index_input.json --max-concurrent-tasks 1 --singularity
4 Build RSEM index
4.1 Prepare input JSON file
{
"build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
"build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
"build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
"build_index.anno_version": "v1",
"build_index.genome": "sugarglider",
"build_index.index_type": "prep_rsem",
"build_index.ncpu": 60,
"build_index.memGB": 512
}
4.2 Build RSEM index
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/rsem_index_input.json --max-concurrent-tasks 1 --singularity
5 Prepare other files
In most cases, you need to modify the following code to suit yourself.
5.1 Prepare transcript IDs to gene biotypes mapping table
In this example, due to the lack of gene biotype field, we assign all transcripts with the gene biotype “protein_coding”.
library(rtracklayer)
library(tidyverse)
library(vroom)
<- "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz"
gtf_file <- "/data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/sugarglider.v1.transcript_id_to_gene_type.tsv"
output_file <- "transcript"
transcript_field <- "protein_coding"
transcript_type
<- as.data.frame(rtracklayer::import(gtf_file, format = "gtf"))
df <- df %>%
df filter(type == transcript_field) %>%
mutate(transcript_type = transcript_type) %>%
select(transcript_id, transcript_type) %>%
arrange(transcript_id) %>%
distinct()
vroom_write(df, file = output_file, col_names = FALSE, append = FALSE)
5.2 Prepare gene/transcript IDs to names mapping table
library(YRUtils)
library(vroom)
<- "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gff3.gz"
gff_file <- "transcript"
target_type
<- extract_gene_id_from_gff(gff_file, target_type = target_type)
df vroom_write(df, file = paste0(gff_file, ".", target_type, "_id_name_mapping_table.tsv"), col_names = TRUE, append = FALSE)